###############################################################################
# Data Preparation
###############################################################################
# This script builds the data sets on which all plots and the analysis are
# based.
###############################################################################
# Content
###############################################################################
# 1) Dependencies
# 2) Load SMD Data Sets
# 3) Combine SMD 2015 & 2019
# 4) Load NER Data Sets
# 5) Combine NER 2015 & 2019
# 6) Combine NER and SMD 
###############################################################################
# 1) Dependencies
###############################################################################
library(readr)
library(dplyr)
library(data.table)
library(tidyr)
library(lubridate)
###############################################################################
# 2) Load SMD Data Sets
###############################################################################
# Set Path
# All paths below are relative ('../data/...'), so the working directory is
# set to the folder of this script. The RStudio API only works inside an
# RStudio session; elsewhere (e.g. Rscript) the current working directory is
# kept and has to be the script folder already.
if (requireNamespace("rstudioapi", quietly = TRUE) &&
    rstudioapi::isAvailable()) {
  setwd(dirname(rstudioapi::getActiveDocumentContext()$path))
} else {
  message("RStudio is not available; keeping working directory: ", getwd())
}
# Start from an empty global environment
rm(list = ls())

# Load classified SMD Data from both 2015 & 2019
# 2015: drop the columns not needed downstream and store pubDateTime as Date
df15 <- readRDS('../data/2015/smd_all_curated_classified_sentiment.RDS') %>%
  dplyr::select(-c(annotation_person, sm, nz)) %>%
  mutate(pubDateTime = as.Date(pubDateTime, "%Y-%m-%d")) %>%
  as_tibble()
# 2019: drop the columns not needed downstream; pubDateTime is used as stored
# in the file
# NOTE(review): presumably pubDateTime is already a Date in the 2019 file,
# since it is not converted here - confirm
df19 <- readRDS('../data/2019/smd_all_curated_classified_sentiment.RDS') %>%
  dplyr::select(-c(annotation_person, annotation_geography, url)) %>%
  as_tibble()

# Column overview of both data sets
names(df15)
names(df19)

# Numbers for Paper:
# Size of each data set (rows x columns)
dim(df19)
dim(df15)

# Mean and standard deviation of the number of rows per publication date
df19 %>%
  group_by(pubDateTime) %>%
  summarise(n = n()) %>%
  summarise(mean = mean(n), sd = sd(n))
df15 %>%
  group_by(pubDateTime) %>%
  summarise(n = n()) %>%
  summarise(mean = mean(n), sd = sd(n))
###############################################################################
# 3) Combine SMD 2015 & 2019
###############################################################################
# Tag every row with its election year and add a year-free calendar key
# ("%m-%d") derived from the publication date
df15 <- df15 %>%
  mutate(year = 2015,
         date = format(as.Date(pubDateTime, "%Y-%m-%d"), format = "%m-%d"))
df19 <- df19 %>%
  mutate(year = 2019,
         date = format(as.Date(pubDateTime, "%Y-%m-%d"), format = "%m-%d"))

# Stack both years into one table and release the single-year tables
smd <- dplyr::bind_rows(df15, df19)
rm(df15, df19)
gc()

# Article ID: source code, underscore, running row number zero-padded to
# seven digits
smd <- smd %>%
  dplyr::mutate(article_id = paste0(so, "_", sprintf("%07d", seq_len(n()))))

# Write the combined data set to disk and read it back in
saveRDS(smd, "../data/smd_all_curated_classified_sentiment_2015_2019.RDS")
smd <- readRDS("../data/smd_all_curated_classified_sentiment_2015_2019.RDS")
###############################################################################
# 4) Load NER Data Sets
###############################################################################
# Read the NER-annotated corpora of both years
ner15 <- readRDS('../data/2015/ner-SMD-annotated-corpus.RDS')
ner19 <- readRDS('../data/2019/ner-SMD-annotated-corpus.RDS')

# 2015: publication date as Date, plus the year tag
ner15 <- ner15 %>%
  mutate(pubDateTime = as.Date(pubDateTime),
         year = 2015)

# 2019: drop the columns that are not carried into the combined data set
# (age is re-computed from year_of_birth further below) and add the year tag
ner19 <- ner19 %>%
  dplyr::select(-c(age, annotation_geography, bundesrat, candidate, city,
                   LINK_facebook, LINK_Instagram, LINK_personal_website,
                   LINK_Twitter)) %>%
  mutate(year = 2019)

names(ner15)
names(ner19)

# The council labels contain inconsistent whitespace characters, so every
# whitespace character is replaced by a plain space (helper column candidacy)
# before the labels are recoded to "sr", "nr" or "sr & nr". Labels matching
# none of the cases become NA. The helper column is removed again.
ner19 <- ner19 %>%
  mutate(
    candidacy = as.character(gsub("\\s", " ", council)),
    council = case_when(
      candidacy %in% c("SR", "Former Staenderat", "Former Staenderat") ~ "sr",
      candidacy %in% c("NR", "Former Nationalrat", "Former Nationalrat") ~  "nr",
      candidacy %in% c("SR und NR", "NR und SR") ~ "sr & nr"
    ),
    candidacy = NULL
  )

# Check the resulting council codes in both years
unique(ner19$council)
unique(ner15$council)

# Calculate Age Variable for ner 2019 (age in the election year), then drop
# all rows with birth year 1900 - this removes Matthias Flückiger, who has a
# completely wrong birth date - and discard year_of_birth afterwards
ner19 <- ner19 %>%
  mutate(age = 2019 - year_of_birth) %>%
  dplyr::filter(!year_of_birth %in% c(1900)) %>%
  dplyr::select(-year_of_birth)

# Check the age values in both years
unique(ner19$age)
unique(ner15$age)

# Compare the column sets of both years before they are combined
sort(names(ner15))
sort(names(ner19))
###############################################################################
# 5) Combine NER 2015 & 2019
###############################################################################
# Stack both years, rename district -> canton and fullTxt -> tx, drop the
# sentiment score and add the constant marker column matched_person = 1
ner <- dplyr::bind_rows(ner15, ner19) %>%
  dplyr::rename(canton = district, tx = fullTxt) %>%
  dplyr::select(-sentiment_value) %>%
  mutate(matched_person = 1)

# The single-year tables are no longer needed
rm(ner15, ner19)

saveRDS(ner, "../data/ner_SMD_2015_2019.RDS")
###############################################################################
# 6) Combine NER and SMD 
###############################################################################
# Merge ner and smd data: every smd row is kept; rows without a matching ner
# row get NA in the columns coming from ner
df <- smd %>%
  dplyr::left_join(ner, by = c("so", "so_txt", "pubDateTime", "la", "tx",
                               "selectsclass", "year"))
rm(smd)

# Adjust names of papers (online editions, "Newsnet / ..." prefixes and
# spelling variants are mapped to one name per outlet; all other names stay
# as they are) and harmonise the gender code "F" to "f"
df <- df %>%
  mutate(
    so_txt = case_when(
      so_txt == "20 minuten online" ~ "20 minuten",
      so_txt == "Newsnet / 24 heures" ~ "24 heures",
      so_txt == "Newsnet / Basler Zeitung" ~ "Basler Zeitung",
      so_txt == "Newsnet / Berner Zeitung" ~ "Berner Zeitung",
      so_txt == "Newsnet / Der Bund" ~ "Der Bund",
      so_txt == "Newsnet / Le Matin" ~ "Le Matin",
      so_txt %in% c("Newsnet / Tribune de Genève", "Tribune de Genève") ~ "Tribune de Genève",
      so_txt == "Newsnet / Tages-Anzeiger" ~ "Tages-Anzeiger",
      so_txt == "Handelszeitung online" ~ "Handelszeitung",
      so_txt == "RTS.ch" ~ "rts.ch",
      so_txt == "SWI swissinfo.ch" ~ "swissinfo.ch",
      so_txt == "Finanz und Wirtschaft Online" ~ "Finanz und Wirtschaft",
      so_txt %in% c("Anzeigen von Uster", "Anzegier von Uster") ~ "Anzeiger von Uster",
      so_txt == "L'Agefi" ~ "Agefi",
      TRUE ~ so_txt
    ),
    gender = ifelse(gender == "F", "f", gender)
  )
#-----------
# Numbers for Data & Methods part and for table 1

# Number of distinct outlets per year
df %>%
  group_by(year) %>%
  summarise(unique = length(unique(so_txt)))

# Number of distinct persons per year and their share of a fixed per-year
# total (2888 for 2015, 3599 for 2019).
# NOTE(review): the totals are presumably the number of candidates per
# election - confirm.
# Rows without a matched person have NA in fullname; NA is excluded so that it
# is not counted as a person.
df %>%
  group_by(year) %>%
  summarise(unique = n_distinct(fullname, na.rm = TRUE)) %>%
  mutate(unique_share = ifelse(year == 2015, unique / 2888, unique / 3599))

# Same count split by gender (rows without a matched person removed first);
# the share uses the same per-year totals as above
df %>%
  group_by(year, gender) %>%
  filter(!is.na(fullname)) %>%
  summarise(unique = length(unique(fullname))) %>%
  ungroup() %>%
  group_by(gender) %>%
  mutate(unique_share = ifelse(year == 2015, unique / 2888, unique / 3599))

# Number of ner rows per year
ner %>%
  group_by(year) %>%
  summarise(n = n())

# Number of distinct documents (doc.id) in ner per year
ner %>%
  group_by(year) %>%
  summarise(n = length(unique(doc.id)))

# Number of rows per council code (NA = no council information)
df %>%
  group_by(council) %>%
  summarise(n = n())
# ----------



# Reduce comma-separated class strings to a single class: if the string
# starts with "PoliticalSystem" and holds more than one class, the second
# class is used, otherwise the first one. The string is split only once and
# vapply() guarantees a character vector even for zero rows.
class_parts <- strsplit(df$selectsclass, ',')
df <- mutate(df,
             selectsclass = ifelse(grepl('^PoliticalSystem', selectsclass) &
                                     (lengths(class_parts) > 1),
                                   vapply(class_parts, `[`, character(1), 2),
                                   vapply(class_parts, `[`, character(1), 1)))
rm(class_parts)

# The full text is not needed in the combined data set
df <- df %>% dplyr::select(-tx)
# Rows without a matched ner row have NA in matched_person after the left
# join; set the marker to 0 for them
df <- df %>%
  mutate(matched_person = ifelse(is.na(matched_person), 0, matched_person))

# Topic frequencies per year including the non-political bucket
# ("Not Classified" and "Other_Problems" are counted as "NotPolitical").
# Printed for inspection only; the result is not stored because topics_df
# below holds the table that is exported.
# summarise() drops the last grouping level, so sum(n) is the total per year.
df %>%
  mutate(selectsclass = ifelse(selectsclass %in% c("NotPolitical", "Not Classified", "Other_Problems"),
                               "NotPolitical", selectsclass)) %>%
  group_by(year, selectsclass) %>%
  summarise(n = n()) %>%
  mutate(freq = n / sum(n))

# Topic distribution among the political classes only: counts and
# percentages (rounded to two digits) per year, one row per class and one
# n_/perc_ column per year
topics_df <- df %>%
  mutate(selectsclass = ifelse(selectsclass %in% c("NotPolitical", "Not Classified", "Other_Problems"),
                               "NotPolitical", selectsclass)) %>%
  filter(!selectsclass %in% c("NotPolitical")) %>%
  group_by(year, selectsclass) %>%
  summarise(n = n()) %>%
  mutate(perc = (n / sum(n) * 100)) %>%
  mutate(perc = round(perc, digits = 2)) %>%
  pivot_wider(names_from = "year", values_from = c("n", "perc"))

# Export the topic table as LaTeX
stargazer::stargazer(topics_df, out = "../tables_main/topics_dist_2015_2019.tex", type = "latex", summary = FALSE, digits = 2)

# Store the combined SMD + NER data set
saveRDS(df, "../data/smd_ner_2015_2019_combined.RDS")
